Filter criteria:

glue('Number of genes: ', nrow(datExpr), '\n',
     'Number of samples: ', ncol(datExpr), ' (', sum(datMeta$Diagnosis_=='ASD'), ' ASD, ',
     sum(datMeta$Diagnosis_!='ASD'), ' controls)')
## Number of genes: 2928
## Number of samples: 86 (51 ASD, 35 controls)

Dimensionality reduction using PCA

reduce_dim_datExpr = function(datExpr, datMeta, var_explained=0.8, filter_controls=FALSE){

  datExpr = data.frame(datExpr)
  
  if(filter_controls){
    datMeta = datMeta %>% filter(Diagnosis_=='ASD')
    datExpr = datExpr %>% select(paste0('X', datMeta_ASD$Dissected_Sample_ID))
  }
  
  datExpr_pca = prcomp(t(datExpr), scale=TRUE)
  last_pc = data.frame(summary(datExpr_pca)$importance[3,]) %>% rownames_to_column(var='ID') %>% 
    filter(.[[2]] >= var_explained) %>% top_n(-1, ID)
  
  par(mfrow=c(1,2))
  plot(summary(datExpr_pca)$importance[2,], type='b')
  abline(v=substr(last_pc$ID, 3, nchar(last_pc$ID)), col='blue')
  plot(summary(datExpr_pca)$importance[3,], type='b')
  abline(h=var_explained, col='blue')
  
  print(glue('Keeping top ', substr(last_pc$ID, 3, nchar(last_pc$ID)), ' components that explain ',
             var_explained*100, '% of the variance'))
  
  datExpr_top_pc = datExpr_pca$x %>% data.frame %>% dplyr::select(PC1:last_pc$ID)
  
  return(list('datExpr'=datExpr_top_pc, 'datMeta'=datMeta, 'pca_output'=datExpr_pca))
}

reduce_dim_output = reduce_dim_datExpr(datExpr, datMeta)

## Keeping top 10 components that explain 80% of the variance
datExpr_redDim = reduce_dim_output$datExpr
datMeta_redDim = reduce_dim_output$datMeta
pca_output = reduce_dim_output$pca_output

rm(datSeq, datProbes, reduce_dim_datExpr, reduce_dim_output, datExpr, datMeta)

Clustering

clusterings = list()

K-means Clustering

No recognisable best k, so chose k=5

set.seed(123)
wss = sapply(1:15, function(k) kmeans(datExpr_redDim, k, nstart=25)$tot.withinss)
plot(wss, type='b', main='K-Means Clustering')
best_k = 4
abline(v = best_k, col='blue')

datExpr_k_means = kmeans(datExpr_redDim, best_k, nstart=25)
clusterings[['km']] = datExpr_k_means$cluster

Hierarchical Clustering

Chose k=7 as best number of clusters.

Clusters seem to be able to separate ASD and control samples pretty well and there are no noticeable patterns regarding sex, age or brain region in any cluster.

Younger ASD samples seem to be more similar to control samples than older ASD samples (pink cluster has most of the youngest samples), perhaps oder samples were only diagnosed in extreme cases and now milder ASD cases get diagnosed a well and milder cases have milder gene expression differences? The yellow cluster is made of young ASD samples. Most old ASD samples are also close together.

Colors:

  • Diagnosis: Blue=control, Green=ASD

  • Sex: Pink=Female, Blue=Male

  • Brain region: Pink=Frontal, Green=Temporal, Blue=Parietal, Purple=Occipital

  • Age: Purple=youngest, Yellow=oldest

h_clusts = datExpr_redDim %>% dist %>% hclust %>% as.dendrogram
# h_clusts %>% plot
best_k = 6
clusterings[['hc']] = cutree(h_clusts, best_k)

create_viridis_dict = function(age){
  min_age = datMeta_redDim$Age %>% min
  max_age = datMeta_redDim$Age %>% max
  viridis_age_cols = viridis(max_age - min_age + 1)
  names(viridis_age_cols) = seq(min_age, max_age)
  
  return(viridis_age_cols)
}
viridis_age_cols = create_viridis_dict()

dend_meta = datMeta_redDim[match(gsub('X','',labels(h_clusts)), rownames(datMeta_redDim)),] %>% 
            mutate('Diagnosis' = ifelse(Diagnosis_=='CTL','#008080','#86b300'), # Blue control, Green ASD
                   'Sex' = ifelse(Sex=='F','#ff6666','#008ae6'),                # Pink Female, Blue Male
                   'Region' = case_when(Brain_lobe=='Frontal'~'#F8766D',        # ggplot defaults for 4 colours
                                        Brain_lobe=='Temporal'~'#7CAE00',
                                        Brain_lobe=='Parietal'~'#00BFC4',
                                        Brain_lobe=='Occipital'~'#C77CFF'),
                   'Age' = viridis_age_cols[as.character(Age)]) %>%            # Purple: young, Yellow: old
            dplyr::select(Age, Region, Sex, Diagnosis)
h_clusts %>% set('labels', rep('', nrow(datMeta_redDim))) %>% set('branches_k_color', k=best_k) %>% plot
colored_bars(colors=dend_meta)

Consensus Clustering

Samples are grouped into two big clusters. It wasn’t clear which was the best number of subclusters for the first one, but the second one was clearer. Chose 6 and 8, respectively.

*Output plots in clustering_samples_03_20 folder

Independent Component Analysis

Following this paper’s guidelines:

  1. Run PCA and keep enough components to explain 60% of the variance

  2. Run ICA with that same number of nbComp as principal components kept to then filter them

  3. Select components with kurtosis > 3

  4. Assign obs to genes with FDR<0.001 using the fdrtool package

## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr
## 
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr
## 
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr
## 
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr
## 
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr
## 
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr
## 
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr
## 
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr
## 
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr
## 
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr

Not a good method for clustering samples because:

  1. ICA does not perform well with small samples (see Figure 4 of this paper)

  2. Warnings: (Warning in fdrtool(x, plot = F): There may be too few input test statistics for reliable FDR calculations!)

  3. Leaves almost half of the observations (40) without a cluster, which is better than with the past datasets, but it’s still a lot:

ICA_clusters %>% rowSums %>% table
## .
##  0  1  2  3 
## 40 33  8  5
ICA_clusters %>% mutate(cl_sum=rowSums(.)) %>% as.matrix %>% melt %>% ggplot(aes(Var2,Var1)) + 
  geom_tile(aes(fill=value)) + xlab('Clusters') + ylab('Samples') + 
  theme_minimal() + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + coord_flip()

WGCNA

SFT.R.sq starts in 0.8 with power=1 but at 2 decreases to 0.28 and starts slowly growing. Best power=21.

best_power = datExpr_redDim %>% t %>% pickSoftThreshold(powerVector = seq(2, 26, by=2))
##    Power SFT.R.sq   slope truncated.R.sq mean.k. median.k. max.k.
## 1      2   0.7370  0.6040          0.770  28.800    30.900  45.20
## 2      4   0.0282 -0.0418         -0.247  15.400    15.400  30.50
## 3      6   0.6200 -0.3230          0.582   9.600     8.730  22.50
## 4      8   0.5840 -0.4680          0.598   6.510     5.400  17.40
## 5     10   0.6040 -0.7360          0.634   4.680     3.550  14.00
## 6     12   0.6610 -0.7830          0.676   3.500     2.390  11.50
## 7     14   0.6230 -0.9060          0.580   2.710     1.720   9.67
## 8     16   0.7400 -0.9990          0.737   2.150     1.300   8.26
## 9     18   0.8600 -0.9430          0.916   1.740     0.967   7.15
## 10    20   0.1270 -2.0500         -0.122   1.440     0.806   6.26
## 11    22   0.8370 -1.0600          0.870   1.200     0.667   5.54
## 12    24   0.8810 -1.1000          0.921   1.020     0.581   4.95
## 13    26   0.8660 -1.0700          0.888   0.875     0.505   4.45
network = datExpr_redDim %>% t %>% blockwiseModules(power=best_power$powerEstimate, numericLabels=TRUE)
##      mergeCloseModules: less than two proper modules.
##       ..color levels are 0, 1
##       ..there is nothing to merge.
clusterings[['WGCNA']] = network$colors
names(clusterings[['WGCNA']]) = rownames(datExpr_redDim)

It finds a single cluster grouping 66 observations and leaves the rest without cluster:

table(clusterings[['WGCNA']])
## 
##  0  1 
## 20 66

Gaussian Mixture Models with hard thresholding

Points don’t seem to follow a Gaussian distribution no matter the number of clusters, chose 4 points following the best k from K-means because the methods are similar

n_clust = datExpr_redDim %>% Optimal_Clusters_GMM(max_clusters=80, criterion='BIC', plot_data=FALSE)
plot(n_clust, type='l', main='Bayesian Information Criterion to choose number of clusters')

best_k = 4 # copying k-means best_k
gmm = datExpr_redDim %>% GMM(best_k)
clusterings[['GMM']] = gmm$Log_likelihood %>% apply(1, which.max)

Plot of clusters with their centroids in gray

gmm_points = rbind(datExpr_redDim, setNames(data.frame(gmm$centroids), names(datExpr_redDim)))
gmm_labels = c(clusterings[['GMM']], rep(NA, best_k)) %>% as.factor
ggplotly(gmm_points %>% ggplot(aes(x=PC1, y=PC2, color=gmm_labels)) + geom_point() + theme_minimal())
rm(wss, datExpr_k_means, h_clusts, cc_output, cc_output_c1, cc_output_c2, best_k, ICA_output, 
   ICA_clusters_names, signals_w_kurtosis, n_clust, gmm, gmm_points, gmm_labels, network, dend_meta, 
   best_power, c, viridis_age_cols, create_viridis_dict)

Compare clusterings

Using Adjusted Rand Index:

clusters_plus_phenotype = clusterings
clusters_plus_phenotype[['ICA_NA']] = is.na(clusters_plus_phenotype[['ICA_min']])
clusters_plus_phenotype[['Subject']] = datMeta_redDim$Subject_ID
clusters_plus_phenotype[['ASD']] = datMeta_redDim$Diagnosis_
clusters_plus_phenotype[['Region']] = datMeta_redDim$Brain_lobe
clusters_plus_phenotype[['Sex']] = datMeta_redDim$Sex
clusters_plus_phenotype[['Age']] = datMeta_redDim$Age

cluster_sim = data.frame(matrix(nrow = length(clusters_plus_phenotype), ncol = length(clusters_plus_phenotype)))
for(i in 1:(length(clusters_plus_phenotype))){
  cluster1 = as.factor(clusters_plus_phenotype[[i]])
  for(j in (i):length(clusters_plus_phenotype)){
    cluster2 = as.factor(clusters_plus_phenotype[[j]])
    cluster_sim[i,j] = adj.rand.index(cluster1, cluster2)
  }
}
colnames(cluster_sim) = names(clusters_plus_phenotype)
rownames(cluster_sim) = colnames(cluster_sim)

cluster_sim = cluster_sim %>% as.matrix %>% round(2)
heatmap.2(x = cluster_sim, Rowv = FALSE, Colv = FALSE, dendrogram = 'none', 
          cellnote = cluster_sim, notecol = 'black', trace = 'none', key = FALSE, 
          cexRow = 1, cexCol = 1, margins = c(7,7))

rm(i, j, cluster1, cluster2, clusters_plus_phenotype, cluster_sim)

Scatter plots

create_2D_plot = function(cat_var){
 ggplotly(plot_points %>% ggplot(aes_string(x='PC1', y='PC2', color=cat_var)) + 
          geom_point() + theme_minimal() + 
          xlab(paste0('PC1 (', round(summary(pca_output)$importance[2,1]*100,2),'%)')) +
          ylab(paste0('PC2 (', round(summary(pca_output)$importance[2,2]*100,2),'%)')))
}
create_3D_plot = function(cat_var){
  plot_points %>% plot_ly(x=~PC1, y=~PC2, z=~PC3) %>% add_markers(color=plot_points[,cat_var], size=1) %>% 
    layout(title = glue('Samples coloured by ', cat_var),
           scene = list(xaxis=list(title=glue('PC1 (',round(summary(pca_output)$importance[2,1]*100,2),'%)')),
                        yaxis=list(title=glue('PC2 (',round(summary(pca_output)$importance[2,2]*100,2),'%)')),
                        zaxis=list(title=glue('PC3 (',round(summary(pca_output)$importance[2,3]*100,2),'%)'))))  
}

plot_points = datExpr_redDim %>% data.frame() %>% dplyr::select(PC1:PC3) %>%
              mutate(ID = rownames(.), subject_ID = datMeta_redDim$Subject_ID,
                km_clust = as.factor(clusterings[['km']]),       hc_clust = as.factor(clusterings[['hc']]),
                cc_l1_clust = as.factor(clusterings[['cc_l1']]), cc_clust = as.factor(clusterings[['cc_l2']]),
                ica_clust = as.factor(clusterings[['ICA_min']]), n_ica_clust = as.factor(rowSums(ICA_clusters)),
                gmm_clust = as.factor(clusterings[['GMM']]),     wgcna_clust = as.factor(clusterings[['WGCNA']]),
                sex = as.factor(datMeta_redDim$Sex),             region = as.factor(datMeta_redDim$Brain_lobe), 
                diagnosis = as.factor(datMeta_redDim$Diagnosis_), age = datMeta_redDim$Age)

2D plots of clusterings

create_2D_plot('km_clust')
create_2D_plot('hc_clust')
create_2D_plot('cc_l1_clust')
create_2D_plot('cc_clust')
create_2D_plot('ica_clust')
create_2D_plot('gmm_clust')
create_2D_plot('wgcna_clust')

2D plots of Phenotypes

create_2D_plot('diagnosis')
create_2D_plot('region')
create_2D_plot('sex')
create_2D_plot('age')

3D plots

create_3D_plot('ica_clust')
create_3D_plot('wgcna_clust')